Setup the Environment

CSS Styling
writeLines("td, th { padding : 3px } th { background-color:white; color:black; border:1px solid black; text-align:center } td {color:black; border:1px solid black; word-wrap:break-word; white-space:nowrap; overflow: hidden; text-overflow: ellipsis; max-width:300px; text-align:left}", con= "../css/style.css")

Visualization of Metabolites Between Replicate Pairs

Boxplots (Original Abundances)

# Plot Boxplots faceted by shared metabolites
################################################################################
p <- plot_df %>%
  ggplot(aes(y = METABOLITE_NAME, x = VALUE, color = REPLICATE)) +
  geom_boxplot(alpha = 0.8) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Original Metabolite Abundances"),
             x = "Abundance", y = "") 
plot(p)

Zero/Missing Values in Metabolites (Original Abundances)

# Identify Zero/Missing Values in Metabolites
################################################################################
na_df <- plot_df %>%
  ungroup() %>% 
  group_by(METABOLITE_NAME, REPLICATE) %>%
  mutate(ZERO_LOG = ifelse(VALUE == 0, 1, 0)) %>%
  mutate(ZERO_N = sum(ZERO_LOG)) %>%
  mutate(ZERO_FREQ = round(ZERO_N/length(shared_sams), digits = 2)) %>%
  mutate(NA_LOG = ifelse(is.na(VALUE), 1, 0)) %>%
  mutate(NA_N = sum(NA_LOG)) %>%
  mutate(NA_FREQ = round(NA_N/length(shared_sams), digits = 2)) %>%
  select(REPLICATE, METABOLITE_NAME, ZERO_N, ZERO_FREQ, NA_N, NA_FREQ) %>%
  unique() %>%
  arrange(REPLICATE, desc(ZERO_FREQ), desc(NA_FREQ)) %>%
  ungroup()

na_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "400px")
REPLICATE METABOLITE_NAME ZERO_N ZERO_FREQ NA_N NA_FREQ
811 Cer(d18:0/14:0) 0 0 0 0
811 Cer(d18:0/16:0) 0 0 0 0
811 Cer(d18:1/18:0) 0 0 0 0
811 Cer(d18:1/20:0) 0 0 0 0
811 Cer(d18:1/22:0) 0 0 0 0
811 Cer(d18:1/24:0) 0 0 0 0
811 Cer(d18:1/24:1) 0 0 0 0
811 Sphinganine 1-phosphate 0 0 0 0
811 Sphinganine 0 0 0 0
811 Sphingosine 0 0 0 0
812 Cer(d18:0/14:0) 0 0 0 0
812 Cer(d18:0/16:0) 0 0 0 0
812 Cer(d18:1/18:0) 0 0 0 0
812 Cer(d18:1/20:0) 0 0 0 0
812 Cer(d18:1/22:0) 0 0 0 0
812 Cer(d18:1/24:0) 0 0 0 0
812 Cer(d18:1/24:1) 0 0 0 0
812 Sphinganine 1-phosphate 0 0 0 0
812 Sphinganine 0 0 0 0
812 Sphingosine 0 0 0 0
# Remove Metabolites with high NAor Zero Frequencies
################################################################################
met_rm <- na_df %>%
  filter((ZERO_FREQ >= 0.95) | (NA_FREQ >= 0.8)) %>%
  select(METABOLITE_NAME) %>% unlist() %>% unique()
metabolites <- metabolites[metabolites %!in% met_rm]
plot_df <- plot_df %>%
  filter(METABOLITE_NAME %in% metabolites)
plot_df$METABOLITE_NAME <- as.character(plot_df$METABOLITE_NAME)

Distributions (Original Abundances)

# Summary Statistics of Density plot distribution
################################################################################
# Iterate through the runs and collect vectors
df_all <- data.frame()
for(rep in c(reps)){
  # Collect numeric vector
  num_vec <- plot_df %>%
    filter(REPLICATE == rep) %>%
    select(VALUE) %>% unlist() %>% unname()
  df <- NumericSummaryStats(num_vec)
  row.names(df) <- rep
  df_all <- rbind(df_all, df)
}
df_all %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
NA_COUNT NA_FREQ MEAN MEDIAN MAX MIN MID_RANGE VARIANCE STD_DEV SE_MEAN Q1 Q3 KURTOSIS SKEW BC_LAMBDA
811 0 0 66.55 15.67 603.83 0.23 603.60 10897.71 104.39 3.74 5.61 80.09 5.12 2.29 0.1
812 0 0 69.41 17.32 569.23 0.22 569.01 11566.65 107.55 3.85 5.72 86.59 4.42 2.20 0.1
# Plot the density plot for all the gene counts
################################################################################
plot_df %>%
  ggplot(aes(x = VALUE, color = REPLICATE)) +
  geom_density() +
  xlim(min(df_all$MIN), mean(df_all$Q3)) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Original Metabolite Abundances"),
       x = "Abundance",
       y = "Density")

Metabolite by Metabolite Correlation (Original Abundances)

################################################################################
####################### Metabolite by Metabolite Correlations ##################
################################################################################
# Subset Matrices
x <- plot_df %>%
  unite(METABOLITE_NAME,REPLICATE, 
        col = METABOLITE_NAME_REPLICATE, remove = F) %>%
  select(METABOLITE_NAME_REPLICATE, labelid, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME_REPLICATE, values_from = VALUE) %>%
  map(arrange, labelid) %>%
  map(tibble::column_to_rownames, var = "labelid") %>%
  map(as.matrix)

# Subset matrices
if(all(row.names(x[[1]]) == row.names(x[[2]]))){
  shared_met_mat <-  do.call(cbind,x) 
}

Corr <- 'Spearman'
# Create an NxN Matrix of correlation
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_met_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_met_mat, method = 'spearman') %>% round(digits = 3)
}


# Reorder the correlation matrix
cormat <- cor1

# Melt the correlation matrix
melted_cormat <- melt(cormat, na.rm = TRUE)

################################################################################
####################### Only Reciprocal Pairs ##################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat 
# Widen the datafrmae back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = T), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(cormat2, color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

Scatterplots (Original Abundances)

# Subset the data for sites to compare (scatterplots)
###################################################
plot_join_df <- countdata_df %>%
  ungroup() %>%
  unite(STUDY_INSTITUTE,METAB_FAMILY, col = "STUDY_INSTITUTE_METAB_FAMILY") %>%
  filter(TISSUE == tissue) %>%
  filter(NAMED == named) %>%
  filter(STUDY_INSTITUTE_METAB_FAMILY == study_institute_metab_family) %>%
  unnest(COUNT_DATA) %>%
  filter(viallabel %in% as.character(unlist(sample_join$sample_id))) %>%
  filter(METABOLITE_NAME %in% metabolites) %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  mutate(REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                              grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                              )) %>%
  select(REPLICATE,METABOLITE_NAME,labelid,VALUE) %>%
  mutate(REPLICATE = as.character(REPLICATE)) %>%
  split(.$REPLICATE) %>%
  map(~rename(., !!sym(unique(.$REPLICATE)) := "VALUE")) %>%
  map(~select(., -REPLICATE)) %>%
  purrr::reduce(left_join, by = c("labelid","METABOLITE_NAME"))

# Plot scatter plots
################################################################################
# Plot scatter plots ordered by sample (include R2 values)
lm_df <- plot_join_df %>%
  rename(name = METABOLITE_NAME) %>%
  rename(x = reps[1]) %>%
  rename(y = reps[2])
# Iterate through each of the shared metabolites to collect summary info about their correlations
r2_df <- data.frame()
for(metab in metabolites){
  met_df <- lm_df %>%
    filter(name == metab)
  r2_df <- rbind(r2_df, lm_eqn(met_df))
}
# Display summaries
met_r2_df <- r2_df %>%
  arrange(METABOLITE) %>%
  select(METABOLITE,R2) %>%
  arrange(desc(R2))
met_r2_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
METABOLITE R2
Sphingosine 0.539
Cer(d18:0/16:0) 0.483
Cer(d18:1/18:0) 0.465
Cer(d18:0/14:0) 0.33
Cer(d18:1/24:1) 0.241
Cer(d18:1/24:0) 0.235
Sphinganine 0.228
Cer(d18:1/20:0) 0.159
Cer(d18:1/22:0) 0.0687
Sphinganine 1-phosphate 0.0208
# Collect the order of metabolites
scat_met_order <- met_r2_df %>% select(METABOLITE) %>% unlist() %>% as.character()

# Plot the scatter plots
for(metab in scat_met_order){
  p <- plot_join_df %>%
    filter(METABOLITE_NAME == metab) %>%
  ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", size = 1) +
  geom_abline(linetype="dashed") +
  facet_wrap(~ METABOLITE_NAME) +
  expand_limits(x = 0, y = 0) +
  #labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
  coord_fixed(ratio=1)
  plot(p)
}

Sample by Sample Correlations (Original Abundances)

################################################################################
####################### Sample by Sample Correlations ##########################
################################################################################

# Subset Data
################################################################################
# Original NxP Matrix
x <- plot_df %>%
  unite(labelid,REPLICATE, 
        col = labelid_REPLICATE, remove = F) %>%
  select(labelid_REPLICATE, METABOLITE_NAME, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(tibble::column_to_rownames, var = "labelid_REPLICATE") %>%
  map(as.matrix)
# Subset matrices
shared_sam_mat <-  do.call(rbind,x) %>% 
  t()

# Create an NxN Matrix of correlation
Corr <- 'Spearman'
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_sam_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_sam_mat, method = 'spearman') %>% round(digits = 3)
}

# Melt the correlation matrix
melted_cormat <- melt(cor1, na.rm = TRUE)

################################################################################
####################### Unclustered ############################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat

# Widen the dataframwe back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = TRUE), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(as.matrix(cormat2), color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

PCA Analysis (Original Abundances)

pca <- prcomp(na.omit(t(shared_sam_mat)), scale. = F)
percentVar <- pca$sdev^2/sum(pca$sdev^2)
PC <- pca$x %>% as.data.frame()
PC$labelid_rep <- as.character(row.names(pca$x))
PC <- PC %>%
  tidyr::separate(col = labelid_rep, into = c('labelid', 'REPLICATE'),
  sep = "_", remove = T)
# Join the phenotype data
PC <- left_join(PC, pheno_df, by = 'labelid')

p0 <- PC %>%
  ggplot(aes(x = PC1, y = PC2, color=REPLICATE)) +
  geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
  ggtitle('Shared Samples and Shared Metabolites') + 
  xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
  ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
  theme(aspect.ratio=1)
p0

p1 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Key.anirandgroup, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        scale_color_manual(values=ec_colors) +
        theme(aspect.ratio=1)
p1

PC$Registration.sex <- as.character(PC$Registration.sex)
p2 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Registration.sex, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        theme(aspect.ratio=1)
p2

Normalization (AutoScale)

# Normalize
# Note in this step, we use labelid to account for mayo's duplicate samples
################################################################################
norm_df <- plot_df %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  split(.$REPLICATE) %>%
  map(select, labelid_viallabel, METABOLITE_NAME, VALUE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(column_to_rownames, var = "labelid_viallabel") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix) %>%
  map(as.data.frame) %>%
  map(rownames_to_column, var = "labelid_viallabel") %>%
  map(pivot_longer, names_to = 'METABOLITE_NAME', values_to = 'VALUE', cols = all_of(metabolites)) %>%
  map(mutate, REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                                    grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  bind_rows()

Boxplots (Normalized Abundances)

# Plot Boxplots faceted by shared metabolites
################################################################################
norm_df %>%
  ggplot(aes(y = METABOLITE_NAME, x = VALUE, color = REPLICATE)) +
  geom_boxplot(alpha = 0.8) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances"),
             x = "Abundance", y = "") 

Metabolite by Metabolite Correlation (Normalized Abundances)

################################################################################
####################### Metabolite by Metabolite Correlations ##################
################################################################################
# Subset Matrices
x <- plot_df %>%
  unite(METABOLITE_NAME,REPLICATE, 
        col = METABOLITE_NAME_REPLICATE, remove = F) %>%
  select(METABOLITE_NAME_REPLICATE, labelid, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME_REPLICATE, values_from = VALUE) %>%
  map(arrange, labelid) %>%
  map(tibble::column_to_rownames, var = "labelid") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix)

# Subset matrices
if(all(row.names(x[[1]]) == row.names(x[[2]]))){
  shared_met_mat <-  do.call(cbind,x) 
}

Corr <- 'Spearman'
# Create an NxN Matrix of correlation
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_met_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_met_mat, method = 'spearman') %>% round(digits = 3)
}

# Reorder the correlation matrix
cormat <- cor1

# Melt the correlation matrix
melted_cormat <- melt(cormat, na.rm = TRUE)

################################################################################
####################### Only Reciprocal Pairs ##################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat 
# Widen the datafrmae back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = T), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(cormat2, color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

Scatterplots (Normalized Abundances)

# Normalize
# Subset the data for sites to compare (scatterplots)
###################################################
norm_join_df <- countdata_df %>%
  ungroup() %>%
  unite(STUDY_INSTITUTE,METAB_FAMILY, col = "STUDY_INSTITUTE_METAB_FAMILY") %>%
  filter(TISSUE == tissue) %>%
  filter(NAMED == named) %>%
  filter(STUDY_INSTITUTE_METAB_FAMILY == study_institute_metab_family) %>%
  unnest(COUNT_DATA) %>%
  filter(viallabel %in% as.character(unlist(sample_join$sample_id))) %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  mutate(REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                               grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  split(.$REPLICATE) %>%
  map(select, labelid_viallabel, METABOLITE_NAME, VALUE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(column_to_rownames, var = "labelid_viallabel") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix) %>%
  map(as.data.frame) %>%
  map(rownames_to_column, var = "labelid_viallabel") %>%
  map(pivot_longer, names_to = 'METABOLITE_NAME', values_to = 'VALUE', cols = all_of(metabolites)) %>%
  map(mutate, REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                                    grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  map(~rename(., !!sym(unique(.$REPLICATE)) := "VALUE")) %>%
  map(~select(., -REPLICATE)) %>%
  map(mutate, labelid = gsub(pattern = "_..*", replacement = '', labelid_viallabel)) %>%
  map(select, -labelid_viallabel) %>%
  purrr::reduce(left_join, by = c("labelid","METABOLITE_NAME"))

# Plot scatter plots ordered by sample (include R2 values)
lm_df <- norm_join_df %>%
  rename(name = METABOLITE_NAME) %>%
  rename(x = reps[1]) %>%
  rename(y = reps[2])
# Iterate through each of the shared metabolites to collect summary info about their correlations
r2_df <- data.frame()
for(metab in metabolites){
  met_df <- lm_df %>%
    filter(name == metab)
  r2_df <- rbind(r2_df, lm_eqn(met_df))
}
# Display summaries
met_r2_df <- r2_df %>%
  arrange(METABOLITE) %>%
  select(METABOLITE,R2) %>%
  arrange(desc(R2))
met_r2_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
METABOLITE R2
Sphingosine 0.539
Cer(d18:0/16:0) 0.483
Cer(d18:1/18:0) 0.465
Cer(d18:0/14:0) 0.33
Cer(d18:1/24:1) 0.241
Cer(d18:1/24:0) 0.235
Sphinganine 0.228
Cer(d18:1/20:0) 0.159
Cer(d18:1/22:0) 0.0687
Sphinganine 1-phosphate 0.0208
# Collect the order of metabolites
scat_met_order <- met_r2_df %>% select(METABOLITE) %>% unlist() %>% as.character()

# Plot scatter plots (Normalized)
################################################################################
# Plot the scatter plots
for(metab in scat_met_order){
  p <- norm_join_df %>%
    filter(METABOLITE_NAME == metab) %>%
  ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", size = 1) +
  geom_abline(linetype="dashed") +
  facet_wrap(~ METABOLITE_NAME) +
  expand_limits(x = 0, y = 0) +
  #labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
  coord_fixed(ratio=1)
  plot(p)
}

# norm_join_df %>%
#   mutate(METABOLITE_NAME = factor(METABOLITE_NAME, levels = scat_met_order)) %>%
#   ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
#   geom_point(alpha = 0.5) +
#   geom_smooth(method = "lm", size = 1) +
#   geom_abline(linetype="dashed") +
#   facet_wrap(~ METABOLITE_NAME) +
#   expand_limits(x = 0, y = 0) +
#   labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
#   coord_fixed(ratio=1)

Distributions (Normalized Abundances)

# Plot the density plot for all the gene counts
################################################################################
norm_df %>%
  ggplot(aes(x = VALUE, color = REPLICATE)) +
  geom_density() +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances"),
       x = "Abundance",
       y = "Density")

# Summary Statistics of Density plot distribution
################################################################################
# Iterate through the runs and collect vectors
df_all <- data.frame()
for(rep in reps){
  # Collect numeric vector
  num_vec <- norm_df %>%
    filter(REPLICATE == rep) %>%
    select(VALUE) %>% unlist() %>% unname()
  df <- NumericSummaryStats(num_vec)
  row.names(df) <- rep
  df_all <- rbind(df_all, df)
}
df_all
##     NA_COUNT NA_FREQ MEAN MEDIAN  MAX   MIN MID_RANGE VARIANCE STD_DEV SE_MEAN    Q1   Q3 KURTOSIS SKEW
## 811        0       0    0  -0.18 4.15 -2.17      6.32     0.99    0.99    0.04 -0.67 0.50     1.79 1.13
## 812        0       0    0  -0.18 4.41 -2.11      6.52     0.99    0.99    0.04 -0.77 0.63     0.61 0.80
##     BC_LAMBDA
## 811      None
## 812      None
#}

Sample by Sample Correlations (Normalized Abundances)

################################################################################
####################### Sample by Sample Correlations ##########################
################################################################################

# Subset Data
################################################################################
# Original NxP Matrix
x <- plot_df %>%
  unite(labelid,REPLICATE, 
        col = labelid_REPLICATE, remove = F) %>%
  select(labelid_REPLICATE, METABOLITE_NAME, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(tibble::column_to_rownames, var = "labelid_REPLICATE") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix)
# Subset matrices
shared_sam_mat <-  do.call(rbind,x) %>% 
  t()

# Create an NxN Matrix of correlation
Corr <- 'Spearman'
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_sam_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_sam_mat, method = 'spearman') %>% round(digits = 3)
}

# Melt the correlation matrix
melted_cormat <- melt(cor1, na.rm = TRUE)

################################################################################
####################### Unclustered ############################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat

# Widen the dataframwe back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = TRUE), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(as.matrix(cormat2), color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

PCA Analysis (Normalized Abundances)

# Plot a PCA with outliers labeled
# shared_sam_mat <- shared_sam_mat %>%
#   t() %>% AutoScaleMatrix() %>%
#   t()
pca <- prcomp(na.omit(t(shared_sam_mat)), scale. = F)
percentVar <- pca$sdev^2/sum(pca$sdev^2)
PC <- pca$x %>% as.data.frame()
PC$labelid_rep <- as.character(row.names(pca$x))
PC <- PC %>%
  tidyr::separate(col = labelid_rep, into = c('labelid', 'REPLICATE'),
  sep = "_", remove = T)
# Join the phenotype data
PC <- left_join(PC, pheno_df, by = 'labelid')

p0 <- PC %>%
  ggplot(aes(x = PC1, y = PC2, color=REPLICATE)) +
  geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
  ggtitle('Shared Samples and Shared Metabolites') + 
  xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
  ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
  theme(aspect.ratio=1)
p0

p1 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Key.anirandgroup, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        scale_color_manual(values=ec_colors) +
        theme(aspect.ratio=1)
p1

PC$Registration.sex <- as.character(PC$Registration.sex)
p2 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Registration.sex, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        theme(aspect.ratio=1)
p2